library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)
##
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## last_plot
## L'objet suivant est masqué depuis 'package:stats':
##
## filter
## L'objet suivant est masqué depuis 'package:graphics':
##
## layout
library(knitr)
library(DT)
library(pals)
df_claims = read.csv("14OURE-PG_2017_CLAIMS_YEAR0.csv")
df_year0 = read.csv("14OURE-PG_2017_YEAR0.csv")
df_year1 = read.csv("14OURE-PG_2017_YEAR1.csv")
df_regions = read.csv("departements_francais.csv", sep = ";") %>% dplyr::select(NUMERO, REGION, "DENSITE..habitants.km2.")
df_year0 = df_year0 %>% mutate(NUMERO = substr(df_year0$pol_insee_code,1,2))
df_year0 = left_join(df_year0, df_regions, by = "NUMERO" )
df_year0$DENSITE..habitants.km2. <- as.numeric(gsub(",", ".", df_year0$DENSITE..habitants.km2.)) #convert density to numeric
names(df_year0)[names(df_year0)=="DENSITE..habitants.km2."] <- 'density'
Certain montants de sinistres sont négatifs : on les exclus de la base et on ne modélise alors que les montants bruts de recours. Des clients peuvent avoir plusieurs sinsitres sur le même contrat dans la même année : pour l’étude statistique du nombre de sinistre, on somme le nombre de sinistre par client.
#df_claims[df_claims$claim_amount>300000,] #one claim greater than 300K
df_claims_0 = df_claims[(df_claims$claim_amount >= 0),]
#df_claims_0 = df_claims[(df_claims$claim_amount >= 0) & (df_claims$claim_amount < 300000), ] #%>% group_by(id_policy) %>% summarise(claim_nb = n(), claim_amount = sum(claim_amount))
summary(df_claims_0$claim_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.32 115.96 364.56 1050.83 1117.56 315780.13
plot1 <- plot(ecdf(df_claims_0$claim_amount), xlab = "Claim Amount", ylab = "Fn(Claim Amount)")
plot2 <- plot(ecdf(log(df_claims_0$claim_amount)), xlab = "Log of Claim Amount", ylab = "Fn(Claim Amount)")
Nous pouvons également réaliser ces grpahiques avec plot_ly, et nous préférerons même ce package qui offre des fonctionnalités intéréssantes (zoom dynamique, résumé pointeur…).
fig1 <-plot_ly(data= arrange(df_claims_0,df_claims_0$claim_amount), x = ~claim_amount, y = ~ecdf(claim_amount)(claim_amount), type= 'scatter', mode ='lines', name ='ecdf of claim amount')
fig2 <- plot_ly(data= arrange(df_claims_0,df_claims_0$claim_amount), x = ~log(claim_amount), y = ~ecdf(log(claim_amount))(log(claim_amount)), type= 'scatter', mode ='lines', name ='ecdf of log claim amount')
fig <- subplot(fig1, fig2) %>%
layout(title = 'Fonction de répartition empirique')
fig
# create final merged df
df = left_join(df_year0, df_claims_0, by = c("id_policy")) %>%
mutate(claim_nb = ifelse(is.na(claim_nb), 0, claim_nb), claim_amount=ifelse(is.na(claim_amount), 0, claim_amount))
df <- df[df$claim_amount > 0,] #keep individuals claims for modelisation
fig1 <- plot_ly(y = ~df$claim_amount, type = "box", name = 'boxplot of claim amount') %>% layout( yaxis = list(range = c(0,4000)))
fig2 <- plot_ly(y = ~ log(df$claim_amount), type = "box", name= 'boxplot of log claim amount')
fig <- subplot(fig1, fig2) %>%
layout(title = 'Boxplot des montants et logs montants')
fig
plot_ly(x = df$drv_age1 ,y = ~df$claim_amount, type = "box", name = 'Boxplot against driver age',) %>% layout(title = 'Boxplot against driver age', xaxis = list(title = 'Driver age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$drv_age1G <- cut(df$drv_age1, c(17, 2:8*10, 100))
plot_ly(data =df, x = ~drv_age1G, y = ~claim_amount, type = "box", name = 'Boxplot against driver age') %>% layout(title = 'Boxplot against driver age', xaxis = list(title = 'Driver age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
## Warning: Ignoring 1 observations
df$drv_age2G <- cut(df$drv_age1, c(17, 25, 45,100))
plot_ly(data =df, x = ~drv_age2G, y = ~claim_amount, type = "box", name = 'Boxplot against driver age') %>% layout(title = 'Boxplot against driver age', xaxis = list(title = 'Driver age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
## Warning: Ignoring 1 observations
plot_ly(data =df, x = ~vh_age, y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against vehicle age', xaxis = list(title = 'Boxplot against Vehicle age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$vh_ageG <- cut(df$vh_age, c(0, 15, 100)) #no vehicle with age 0, min value is 1 in the dataset
plot_ly(data =df, x = ~vh_ageG, y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against vehicle age', xaxis = list(title = 'Boxplot against Vehicle age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
Nous ne distinguons pas de réelles différences, ajoutons quelques catégories.
df$vh_ageG <- cut(df$vh_age, c(0, 10, 25, 30, 100)) #no vehicle with age 0, min value is 1 in the dataset
plot_ly(data =df, x = ~vh_ageG, y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against vehicle age', xaxis = list(title = 'Boxplot against Vehicle age', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
plot_ly(x = df$vh_type ,y = ~df$claim_amount, type = "box", name = 'Boxplot against vehicle type') %>% layout(title = 'Boxplot against vehicle type', xaxis = list(title = 'Vehicle type', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
##Boxplot power {.tabset}
df$vh_dinG = cut(df$vh_din, c(min(df$vh_din), 50, 220, 290, max(df$vh_din)), include.lowest = TRUE)
plot_ly(data= df, x = ~vh_dinG ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against vehicle power', xaxis = list(title = 'Vehicle power', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
#plot_ly(x = df$vh_cyl ,y = ~df$claim_amount, type = "box", name = 'Boxplot against vehicle power')
Pas beaucoup d’intérets à différencier les groupes 3 et 4.
df$vh_dinG = cut(df$vh_din, c(min(df$vh_din), 50, 220, max(df$vh_din)), include.lowest = TRUE)
plot_ly(data= df, x = ~vh_dinG ,y = ~claim_amount, type = "box", name = 'Boxplot against vehicle power') %>% layout(title = 'Boxplot against vehicle power', xaxis = list(title = 'Vehicle power', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
plot_ly(data= df, x = ~vh_make ,y = ~claim_amount, type = "box", name = 'Boxplot against vehicle brand') %>% layout(title = 'Boxplot against vehicle brand', xaxis = list(title = 'Vehicle brand', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$vh_makeG = ifelse(df$vh_make %in% c("RENAULT", "NISSAN", "CITROEN"), "A",
ifelse(df$vh_make %in% c("VOLKSWAGEN", "AUDI", "SKODA", "SEAT"), "B",
ifelse(df$vh_make %in% c("OPEL", "GENERAL MOTORS", "FORD"), "C",
ifelse(df$vh_make %in% c("FIAT"), "D",
ifelse(df$vh_make %in% c("MERCEDES BENZ", "BMW", "CITROEN"), "E", "G")))))
plot_ly(data= df, x = ~vh_makeG ,y = ~claim_amount, type = "box", name = 'Boxplot against vehicle brand') %>% layout(title = 'Boxplot against vehicle brand', xaxis = list(title = 'Vehicle brand groupped', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$vh_valueG = cut(df$vh_value, c(min(df$vh_value), 4:12*2000 , max(df$vh_value)), include.lowest = TRUE)
plot_ly(data= df, x = ~vh_valueG ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against vehicle value', xaxis = list(title = 'Region', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$vh_valueG = cut(df$vh_value, c(min(df$vh_value), 8000, 15000, 22000, max(df$vh_value)), include.lowest = TRUE)
plot_ly(data= df, x = ~vh_valueG ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against vehicle value', xaxis = list(title = 'Region', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
plot_ly(data= df, x = ~REGION ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against region', xaxis = list(title = 'Region', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$densityG<- cut(df$density,c(0,40,200,500,4500,Inf),
include.lowest = TRUE)
plot_ly(data= df, x = ~densityG ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against density', xaxis = list(title = 'Density', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
region_df <- df %>% #pas concluant on peut tej
group_by(REGION) %>%
mutate(vh_fuel = factor(vh_fuel)) %>%
summarize(NrObs = length(claim_amount),
VehAge = mean(vh_age),
DrivAge = mean(drv_age1),
BonusMalus = mean(pol_bonus),
Density = mean(density))
knitr::kable(head(region_df, n = 10))
| REGION | NrObs | VehAge | DrivAge | BonusMalus | Density |
|---|---|---|---|---|---|
| Auvergne-Rhône-Alpes | 1768 | 7.865950 | 53.95136 | 0.5452771 | 203.40170 |
| Bourgogne-Franche-Comté | 531 | 7.934087 | 54.22787 | 0.5423164 | 69.68192 |
| Bretagne | 720 | 7.313889 | 55.33056 | 0.5291389 | 124.53264 |
| Centre-Val de Loire | 496 | 7.647177 | 54.21573 | 0.5462702 | 77.53065 |
| Corse | 40 | 8.000000 | 55.77500 | 0.5615000 | 37.97000 |
| Grand Est | 1255 | 7.835857 | 53.65657 | 0.5455139 | 137.24566 |
| Hauts-de-France | 1198 | 8.020868 | 53.86978 | 0.5455259 | 311.00067 |
| Ile-de-France | 2066 | 7.843175 | 53.70571 | 0.5573427 | 6517.93049 |
| Normandie | 498 | 6.833333 | 53.29920 | 0.5387349 | 135.81386 |
| Nouvelle-Aquitaine | 1474 | 7.875848 | 55.52510 | 0.5337653 | 85.06092 |
df$pol_bonusG = cut(df$pol_bonus, c(0.5, 0.9, 1.2, 1.4, max(df$pol_bonus)), include.lowest = TRUE)
plot_ly(data= df, x = ~pol_bonusG ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against Bonus', xaxis = list(title = 'Bonus', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$pol_durationG = cut(df$pol_duration, c(min(df$pol_duration),10, 20,25,30, max(df$pol_duration)), include.lowest = TRUE)
plot_ly(data= df, x = ~pol_durationG ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against policy duration', xaxis = list(title = 'Policy duration', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
df$pol_coverageG = ifelse(df$pol_coverage %in% c("Median1", "Median2"), "Median", df$pol_coverage)
plot_ly(data= df, x = ~pol_coverageG ,y = ~claim_amount, type = "box") %>% layout(title = 'Boxplot against policy duration', xaxis = list(title = 'Policy duration', zeroline = TRUE), yaxis = list(title = 'Claim Amount (€)',range = c(0,4000)))
mcor = cor(subset(df[, unlist(lapply(df, is.numeric))], select = -claim_nb))
corrplot(mcor, type="upper", order="hclust", tl.col="black", tl.srt=45)
df_sorted <- arrange(df, df$claim_amount )
L’étude de la matrice de corrélation montre que certaines variables sont très corrélées entre elles (vh_din, vh_cyl ou encore vh_value). Nous pouvons donc sélectionner parmis ces variables celles qui nous semblent les plus appropriées pour dans la régression des montants de sinistres. D’après nos boxplots, pvh_din semble porter plus d’information.
amountvsnb <- function(x, add=FALSE, horiz=FALSE, main, ...)
{
n <- length(x)
if(missing(main))
main <- "aggregate amount vs. claim number"
if(!add)
{
z <- cumsum(sort(x))/sum(x)
y <- (1:n)/n
plot(z, y, type="l", ylim=0:1,
ylab="% claim nb.", xlab="% agg. claim amount",
main=main, ...)
if(horiz)
abline(h=1:10/10, lty=3, col="grey")
else
for(i in 3:19)
{
p <- i/20
ind <- head(which(abs(y - p) < 1e-3), 1)
lines(c(-1, z[ind]), c(p, p), lty=2, col="grey25")
lines(c(z[ind], z[ind]), c(-1, p), lty=2, col="grey25")
print(c(p, z[ind]))
}
}else
lines(cumsum(sort(x))/sum(x), (1:n)/n, ...)
}
amountvsnb(df$claim_amount)
## [1] 0.15000000 0.00421052
## [1] 0.200000000 0.007645753
## [1] 0.2500000 0.0123984
## [1] 0.30000000 0.01868896
## [1] 0.35000000 0.02675409
## [1] 0.40000000 0.03689271
## [1] 0.45000000 0.04947674
## [1] 0.5000000 0.0651717
## [1] 0.55000000 0.08449168
## [1] 0.60000 0.10815
## [1] 0.6500000 0.1374753
## [1] 0.7000000 0.1743353
## [1] 0.7500000 0.2210648
## [1] 0.8000000 0.2813181
## [1] 0.8500000 0.3600849
## [1] 0.9000000 0.4669011
## [1] 0.9500000 0.6220907
amountvsnb <- function(x, add=FALSE, horiz=FALSE, main,i=1, ...)
{
cols <- cols25()
x <- x[is.na(x)==0]
n <- length(x)
if(missing(main))
main <- "aggregate amount vs. claim number"
if(!add)
{
z <- cumsum(sort(x))/sum(x)
y <- (1:n)/n
plot(z, y, type="l", ylim=0:1,
ylab="% claim nb.", xlab="% agg. claim amount",
main=main,col = cols[i],lty=2, ...)
if(horiz)
abline(h=1:10/10, lty=3, col="grey")
else
for(i in 3:19)
{
p <- i/20
ind <- head(which(abs(y - p) < 1e-3), 1)
lines(c(-1, z[ind]), c(p, p), lty=2, col="grey25")
lines(c(z[ind], z[ind]), c(-1, p), lty=2, col="grey25")
print(c(p, z[ind]))
}
}else
lines(cumsum(sort(x))/sum(x), (1:n)/n, col=cols[i], lty=2)
}
amountvsnbcat <- function(x, column, main)
{cols <- cols25()
# set the colour palette
len=length(levels(df[[column]]))
amountvsnb(df[df[[column]]==levels(df[[column]])[1], 'claim_amount'], horiz = TRUE, i=1, main=main)
for (i in 2:len){
amountvsnb(df[df[[column]]==levels(df[[column]])[i], 'claim_amount'], horiz = TRUE, add= TRUE, i=i)
}
legend("bottomright", legend = levels(df[[column]]), cex = 0.8, col=cols[1:len],lty=2)
}
amountvsnbcat(df, "drv_age2G", main= "agg./nb. per Driver Age group")
#amountvsnbcat(df, 'Power', main= "agg./nb. per Power")
amountvsnbcat(df, 'vh_ageG', main= "agg./nb. per Car Age group")
df$vh_fuel = factor(df$vh_fuel)
amountvsnbcat(df, 'vh_fuel', main= "agg./nb. per Vehicle Fuel type")